Analyse der Matoma-HaNS-Daten

1 Setup

1.1 R-Pakete starten

Show the code
library(targets)
library(tidyverse)
library(ggokabeito)
library(easystats)
library(gt)
library(ggfittext)
library(scales)
Show the code
# Make theme_minimal() the default ggplot2 theme for all subsequent plots.
theme_set(theme_minimal())

1.2 Roh-Daten laden und inspizieren

JSON-Daten wurden nicht importiert, da offenbar nur redundante Daten enthalten sind.

Show the code
# Load the pipeline target `data_all_fct` from the {targets} store into the
# current environment.
tar_load(data_all_fct)

1.2.1 Dimension

Der Roh-Datensatz verfügt über

  • 3623 Zeilen
  • 7873 Spalten (Dubletten und Spalten mit Bildern bereits entfernt)

Jede Zeile entspricht einem “Visit”.

1.3 Datensatz nur User

Entfernt man Developer, Admins und Lecturers aus dem Roh-Datensatz, so bleiben weniger Zeilen übrig:

Show the code
# Load the pipeline target `data_users_only` from the {targets} store.
tar_load(data_users_only)
  • 3389 Zeilen
  • 7873 Spalten

1.4 Datensatz mit Anzahl der Aktionen pro User

Show the code
# Load the pipeline target `count_action` from the {targets} store.
tar_load(count_action)

1.5 Zeitraum (Beginn, Ende) der Daten

Show the code
# Load the pipeline target `paths` from the {targets} store.
tar_load(paths)

Laut config.yaml ist das aktuelle Semester 24-ss.

Show the code
# Load the pipeline target `time_minmax` from the {targets} store.
tar_load(time_minmax)
Show the code
# Reduce `time_minmax` to the smallest `time_min` and the largest `time_max`
# and render the result as a gt table.
summarise(
  time_minmax,
  time_min = min(time_min),
  time_max = max(time_max)
) |>
  gt()
time_min time_max
2024-03-04 09:40:13 2024-06-06 11:06:14

Diese Statistik wurde auf Basis des Datenobjekts data_slim berechnet.

1.6 Statistiken

1.6.1 Mit den 499er-Daten

Show the code
# Descriptive statistics of `n_max`, rendered as a gt table with every
# numeric column shown with two decimals.
describe_distribution(count_action, n_max) |>
  gt() |>
  fmt_number(columns = where(is.numeric), decimals = 2)
Variable Mean SD IQR Min Max Skewness Kurtosis n n_Missing
n_max 657.18 1,247.39 679.00 10.00 11,816.00 3.43 15.79 3,187.00 0.00

1.6.2 Ohne die 499er-Daten

Show the code
# Keep only rows whose `n_max` is different from 499.
count_action2 <- filter(count_action, n_max != 499)

# Same descriptive table as before, now for the filtered data.
describe_distribution(count_action2, n_max) |>
  gt() |>
  fmt_number(columns = where(is.numeric), decimals = 2)
Variable Mean SD IQR Min Max Skewness Kurtosis n n_Missing
n_max 657.23 1,247.58 681.25 10.00 11,816.00 3.43 15.78 3,186.00 0.00

1.7 Verteilung

1.7.1 Mit den 499er-Daten

Show the code
# Mean and SD of `n_max`; both are used for the reference marks in the plot.
count_action_avg <- mean(count_action$n_max)
count_action_sd <- sd(count_action$n_max)

# Histogram of `n_max` with a vertical line at the mean and a horizontal
# segment on the x-axis spanning mean - SD to mean + SD.
count_action |>
  ggplot() +
  geom_histogram(aes(x = n_max)) +
  labs(x = "Anzahl von Aktionen pro Visit",
       y = "n",
       caption = "Der vertikale Strich zeigt den Mittelwert; der horizontale die SD") +
  theme_minimal() +
  geom_vline(xintercept = count_action_avg,
             color = palette_okabe_ito()[1]) +
  # `linewidth` replaces `size` for line thickness (ggplot2 >= 3.4.0).
  geom_segment(x = count_action_avg - count_action_sd,
               y = 0,
               xend = count_action_avg + count_action_sd,
               yend = 0,
               color = palette_okabe_ito()[2],
               linewidth = 2) +
  annotate("label", x = count_action_avg, y = 1500, label = "MW") +
  annotate("label", x = count_action_avg + count_action_sd, y = 0, label = "SD")

Show the code
  #geom_label(aes(x = count_action_avg), y = 1, label = "Mean")
  • Mittelwert der Aktionen pro Visit: 657.18.
  • SD der Aktionen pro Visit: 1247.39.

1.7.2 Ohne 499er-Daten

Show the code
# Mean and SD of `n_max` in the filtered data (`count_action2`).
count_action_avg2 <- mean(count_action2$n_max)
count_action_sd2 <- sd(count_action2$n_max)

# Histogram of `n_max` with a vertical line at the mean and a horizontal
# segment on the x-axis spanning mean - SD to mean + SD.
count_action2 |>
  ggplot() +
  geom_histogram(aes(x = n_max)) +
  labs(x = "Anzahl von Aktionen pro Visit",
       y = "n",
       title = "Verteilung der User-Aktionen pro Visit",
       caption = "Der vertikale Strich zeigt den Mittelwert; der horizontale die SD") +
  theme_minimal() +
  geom_vline(xintercept = count_action_avg2,
             color = palette_okabe_ito()[1]) +
  # Both ends of the segment use the statistics of the filtered data; the
  # start previously used the mean of the unfiltered `count_action`.
  # `linewidth` replaces `size` for line thickness (ggplot2 >= 3.4.0).
  geom_segment(x = count_action_avg2 - count_action_sd2,
               y = 0,
               xend = count_action_avg2 + count_action_sd2,
               yend = 0,
               color = palette_okabe_ito()[2],
               linewidth = 2) +
  annotate("label", x = count_action_avg2, y = 1500, label = "MW", vjust = "top") +
  annotate("label", x = count_action_avg2 + count_action_sd2, y = 0, label = "SD", vjust = "bottom")

Show the code
  #geom_label(aes(x = count_action_avg), y = 1, label = "Mean")
  • Mittelwert der Aktionen pro Visit: 657.23.
  • SD der Aktionen pro Visit: 1247.58.

2 Zeit pro Visit

Die Visit-Zeit wurde auf 600 Min. begrenzt.

Show the code
# Load the pipeline target `time_spent` from the {targets} store.
tar_load(time_spent)
Show the code
# Add the visit duration in minutes as `t_min` ...
time_spent <- mutate(
  time_spent,
  t_min = as.numeric(time_diff, units = "mins")
)
# ... and keep only visits shorter than 600 minutes.
time_spent <- filter(time_spent, t_min < 600)

2.1 Verweildauer-Statistiken in Sekunden

Show the code
# Dwell-time statistics in seconds, rendered as a gt table.
# `time_diff` is first converted to plain numeric seconds: mean(), min() and
# max() of a difftime return difftime values, which `fmt_number()` leaves
# unformatted, so the table columns were formatted inconsistently.
# NOTE(review): assumes `time_diff` is a difftime, as the units-based
# conversion used for `t_min` suggests.
time_spent |>
  mutate(time_diff_secs = as.numeric(time_diff, units = "secs")) |>
  summarise(
    mean_time_diff = mean(time_diff_secs),
    sd_time_diff = sd(time_diff_secs),
    min_time_diff = min(time_diff_secs),
    max_time_diff = max(time_diff_secs)
  ) |>
  gt() |>
  fmt_number(columns = everything(),
             decimals = 2)
mean_time_diff sd_time_diff min_time_diff max_time_diff
1329.38 2,212.42 0 19946

2.2 Verweildauer-Statistiken in Minuten

Show the code
# Dwell-time statistics based on `t_min` (minutes), rendered as a gt table
# with two decimals in every column.
summarise(
  time_spent,
  mean_t_min = mean(t_min),
  sd_t_min = sd(t_min),
  min_t_min = min(t_min),
  max_t_min = max(t_min)
) |>
  gt() |>
  fmt_number(columns = everything(), decimals = 2)
mean_t_min sd_t_min min_t_min max_t_min
22.16 36.87 0.00 332.43

2.3 Visualisierung der Verweildauer

2.3.1 bins=20

Show the code
# Histogram of the dwell time per visit in minutes, using 20 bins as stated
# in the section heading.
# `t_min` is a plain number of minutes, so the default continuous x scale is
# used; `scale_x_time()` would read the values as seconds and label the axis
# as clock time, contradicting the axis title.
time_spent |>
  ggplot(aes(x = t_min)) +
  geom_histogram(bins = 20) +
  theme_minimal() +
  labs(y = "n",
       x = "Verweildauer in HaNS pro Visit in Minuten")

2.3.2 binwidth=5

Show the code
# Histogram of `t_min` with bins that are 5 minutes wide.
ggplot(time_spent, aes(x = t_min)) +
  geom_histogram(binwidth = 5) +
  theme_minimal() +
  labs(
    title = "Verweildauer in HaNS pro Visit",
    x = "Verweildauer in Minuten",
    y = "n",
    caption = "binwidth = 5 Min."
  )

2.3.3 Zeitdauer begrenzt auf 1-120 Min.

Show the code
# Restrict the data to visits lasting more than 1 and less than 120 minutes.
time_spent2 <-
  time_spent |>
  filter(t_min > 1, t_min < 120)

# Histogram of the restricted dwell times with bins that are 10 minutes wide.
time_spent2 |>
  ggplot(aes(x = t_min)) +
  geom_histogram(binwidth = 10) +
  theme_minimal() +
  labs(y = "n",
       x = "Verweildauer in HaNS pro Visit in Minuten",
       title = "Verweildauer begrenzt auf 1-120 Minuten",
       caption = "binwidth = 10 Min.")

3 Was machen die User?

Show the code
# Load the pipeline target `count_action_type` from the {targets} store.
tar_load(count_action_type)

3.1 Statistiken

Show the code
# Number of rows per `category`, most frequent first, as a gt table.
count(count_action_type, category, sort = TRUE) |>
  gt()
category n
NA 1032265
video 138257
click_slideChange 11116
visit_page 10486
login 866
click_topic 829
Search Results Count 813
in_media_search 620
Kanäle 512
Medien 415
click_channelcard 293
GESOA 260

3.2 Verteilung

3.2.1 Rohwerte

Show the code
# Horizontal bar chart of the number of actions per category, with the bars
# ordered by frequency and the count printed inside each bar.
count_action_type |>
  count(category, sort = TRUE) |>
  ggplot(aes(y = reorder(category, n), x = n)) +
  geom_col() +
  geom_bar_text() +
  labs(
    # The x-axis shows the count `n`, not the action itself.
    x = "Anzahl der User-Aktionen",
    y = "Aktion",
    title = "Anzahl der User-Aktionen nach Kategorie"
  ) +
  theme_minimal() +
  scale_x_continuous(labels = scales::comma)

3.2.2 Log-Skalierung

Show the code
# Same bar chart as for the raw values, but with a log10-scaled x-axis so
# that the rare categories remain visible next to the dominant ones.
count_action_type |>
  count(category, sort = TRUE) |>
  ggplot(aes(y = reorder(category, n), x = n)) +
  geom_col() +
  geom_bar_text() +
  labs(
    x = "Anzahl der User-Aktionen",
    y = "Aktion",
    title = "Anzahl der User-Aktionen nach Kategorie",
    caption = "Log10-Skala"
  ) +
  theme_minimal() +
  scale_x_log10()

4 An welchen Tagen und zu welcher Zeit kommen die User zu HaNS?

4.1 Setup

Show the code
# Load the pipeline target `time_visit_wday` from the {targets} store.
tar_load(time_visit_wday)
Show the code
# Weekday names in the order used as lookup table and as factor levels
# (Monday first, Sunday last).
# NOTE(review): this presumes `dow` is coded 1 = Monday ... 7 = Sunday; if the
# pipeline uses a Sunday-based coding the names below are shifted -- verify.
days_of_week <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")

# Add `dow2`: the weekday name looked up by position via the numeric `dow`,
# stored as a factor so that plots keep the Monday-to-Sunday order.
time_visit_wday$dow2 <- factor(days_of_week[time_visit_wday$dow],
                               levels = days_of_week)

4.2 HaNS-Login nach Uhrzeit

Show the code
# Share of rows per `hour`, drawn as a bar chart.
count(as_tibble(time_visit_wday), hour) |>
  mutate(prop = n / sum(n)) |>
  ggplot(aes(x = hour, y = prop)) +
  geom_col() +
  theme_minimal() +
  labs(title = "HaNS-Nutzer sind keine Frühaufsteher",
       x = "Uhrzeit",
       y = "Anteil")

Show the code
 # coord_polar()
Show the code
# Share of rows per `hour`, drawn in polar coordinates.
count(as_tibble(time_visit_wday), hour) |>
  mutate(prop = n / sum(n)) |>
  ggplot(aes(x = hour, y = prop)) +
  geom_col() +
  theme_minimal() +
  coord_polar()

4.3 Verteilung der HaNS-Besuche nach Wochentagen

Show the code
# Share of rows per weekday (`dow2`), drawn as a bar chart.
count(as_tibble(time_visit_wday), dow2) |>
  mutate(prop = n / sum(n)) |>
  ggplot(aes(x = dow2, y = prop)) +
  geom_col() +
  theme_minimal() +
  labs(
    title = "Verteilung der HaNS-Logins nach Wochentagen",
    x = "Wochentag",
    y = "Anteil"
  )

Show the code
 # coord_polar()
Show the code
# Share of rows per weekday (`dow2`), drawn in polar coordinates.
count(as_tibble(time_visit_wday), dow2) |>
  mutate(prop = n / sum(n)) |>
  ggplot(aes(x = dow2, y = prop)) +
  geom_col() +
  theme_minimal() +
  labs(
    title = "Verteilung der HaNS-Logins nach Wochentagen",
    x = "Wochentag",
    y = "Anteil"
  ) +
  coord_polar()

4.3.1 HaNS-Login nach Wochentagen und Uhrzeit

Show the code
# Share of rows per hour within each weekday, one facet per weekday.
# `prop` is computed per `dow2` group, so the bars of each facet sum to 1.
time_visit_wday |>
  as_tibble() |>
  count(dow2, hour) |>
  group_by(dow2) |>
  mutate(prop = n/sum(n)) |>
  ggplot(aes(x = hour, y = prop)) +
  geom_col() +
  facet_wrap(~ dow2) +
  theme_minimal() +
  labs(title = "Verteilung der HaNS-Logins nach Wochentagen und Uhrzeiten",
       # The x-axis shows `hour`; the weekday is shown by the facets.
       x = "Uhrzeit",
       y = "Anteil")

Show the code
 # coord_polar()
Show the code
# Share of rows per hour within each weekday, one facet per weekday, drawn
# in polar coordinates.
# `prop` is computed per `dow2` group, so the bars of each facet sum to 1.
time_visit_wday |>
  as_tibble() |>
  count(dow2, hour) |>
  group_by(dow2) |>
  mutate(prop = n/sum(n)) |>
  ggplot(aes(x = hour, y = prop)) +
  geom_col() +
  facet_wrap(~ dow2) +
  theme_minimal() +
  labs(title = "Verteilung der HaNS-Logins nach Wochentagen und Uhrzeiten",
       # The x-axis shows `hour`; the weekday is shown by the facets.
       x = "Uhrzeit",
       y = "Anteil") +
  coord_polar()

4.4 Anzahl der Visits nach Datum (Tagen) und Uhrzeit (bin2d)

Show the code
# Ungrouped copy of the visit data with the calendar date of `date_time`.
time2 <-
  time_visit_wday |>
  ungroup() |>
  mutate(date = as.Date(date_time))

# 2D-binned counts of date versus hour of day.
time2 |>
  ggplot(aes(x = date, y = hour)) +
  geom_bin2d(binwidth = c(1, 1)) + # (1 day, 1 hour)
  scale_x_date(date_breaks = "1 month") +
  theme(legend.position = "bottom") +
  scale_fill_viridis_c() +
  # The x-bins are one day wide here, so the caption says "day", not "week".
  labs(caption = "Each x-bin maps to one day")

4.5 Anzahl der Visits nach Datum (Wochen) und Uhrzeit (bin2d)

Show the code
# 2D-binned counts with x-bins of 7 days and y-bins of 1 hour; the x-axis is
# labelled with the week number (`%W`).
ggplot(time2, aes(x = date, y = hour)) +
  geom_bin2d(binwidth = c(7, 1)) +
  scale_x_date(date_breaks = "1 week", date_labels = "%W") +
  scale_fill_viridis_c() +
  theme(legend.position = "bottom") +
  labs(
    x = "Week number in 2023/2024",
    caption = "Each x-bin maps to one week"
  )

4.6 Anzahl der Visits nach Datum (Wochen) und Wochentag (bin2d)

Show the code
# 2D-binned counts with x-bins of 7 days and y-bins of 1 unit of `dow`
# (one weekday); the x-axis is labelled with the week number (`%W`).
ggplot(time2, aes(x = date, y = dow)) +
  geom_bin2d(binwidth = c(7, 1)) +
  scale_x_date(date_breaks = "1 week", date_labels = "%W") +
  scale_y_continuous(breaks = 1:7) +
  scale_fill_viridis_c() +
  theme(legend.position = "bottom") +
  labs(
    x = "Week number in 2023/2024",
    y = "Day of Week",
    caption = "Each x-bin maps to one week"
  )

5 KI-Gebrauch

5.1 Welcher Anteil der Nutzenden klickt auf ein Wort im Transkript?

Show the code
# Load the pipeline target `data_slim` from the {targets} store.
tar_load(data_slim)
Show the code
# Among rows of type "subtitle" with a non-missing, non-empty `value`, count
# how many values contain "click_transcript_word" and add the share of each.
filter(
  data_slim,
  type == "subtitle",
  !is.na(value) & value != ""
) |>
  count(click_transcript_word = str_detect(value, "click_transcript_word")) |>
  mutate(prop = n / sum(n)) |>
  gt()
click_transcript_word n prop
FALSE 190359 0.998929488
TRUE 204 0.001070512

5.2 … Aufteilung nach Monaten

Show the code
# Load the pipeline target `ai_transcript_clicks_per_month` from the
# {targets} store.
tar_load(ai_transcript_clicks_per_month)
Show the code
# Render the monthly transcript-click counts as a gt table.
gt(ai_transcript_clicks_per_month)
click_transcript_word n
2024-3
FALSE 48326
2024-4
FALSE 102929
2024-5
FALSE 31776
2024-6
FALSE 7532
NA-NA
FALSE 190359
TRUE 204